library(tidyverse)
library(GGally)
library(cowplot)
library(class)
library(caret)
library(e1071)
library(reshape2)
library(stringr)
# Load the employee attrition training set (870 rows x 36 columns).
# TRUE spelled out rather than T, which is a reassignable variable.
attrition.df <- read.csv("CaseStudy2-data.csv", header = TRUE)
str(attrition.df)
## 'data.frame': 870 obs. of 36 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Age : int 32 40 35 32 24 27 41 37 34 34 ...
## $ Attrition : chr "No" "No" "No" "No" ...
## $ BusinessTravel : chr "Travel_Rarely" "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" ...
## $ DailyRate : int 117 1308 200 801 567 294 1283 309 1333 653 ...
## $ Department : chr "Sales" "Research & Development" "Research & Development" "Sales" ...
## $ DistanceFromHome : int 13 14 18 1 2 10 5 10 10 10 ...
## $ Education : int 4 3 2 4 1 2 5 4 4 4 ...
## $ EducationField : chr "Life Sciences" "Medical" "Life Sciences" "Marketing" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 859 1128 1412 2016 1646 733 1448 1105 1055 1597 ...
## $ EnvironmentSatisfaction : int 2 3 3 3 1 4 2 4 3 4 ...
## $ Gender : chr "Male" "Male" "Male" "Female" ...
## $ HourlyRate : int 73 44 60 48 32 32 90 88 87 92 ...
## $ JobInvolvement : int 3 2 3 3 3 3 4 2 3 2 ...
## $ JobLevel : int 2 5 3 3 1 3 1 2 1 2 ...
## $ JobRole : chr "Sales Executive" "Research Director" "Manufacturing Director" "Sales Executive" ...
## $ JobSatisfaction : int 4 3 4 4 4 1 3 4 3 3 ...
## $ MaritalStatus : chr "Divorced" "Single" "Single" "Married" ...
## $ MonthlyIncome : int 4403 19626 9362 10422 3760 8793 2127 6694 2220 5063 ...
## $ MonthlyRate : int 9250 17544 19944 24032 17218 4809 5561 24223 18410 15332 ...
## $ NumCompaniesWorked : int 2 1 2 1 1 1 2 2 1 1 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "No" "No" ...
## $ PercentSalaryHike : int 11 14 11 19 13 21 12 14 19 14 ...
## $ PerformanceRating : int 3 3 3 3 3 4 3 3 3 3 ...
## $ RelationshipSatisfaction: int 3 1 3 3 3 3 1 3 4 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 2 0 2 0 3 1 1 ...
## $ TotalWorkingYears : int 8 21 10 14 6 9 7 8 1 8 ...
## $ TrainingTimesLastYear : int 3 2 2 3 2 4 5 5 2 3 ...
## $ WorkLifeBalance : int 2 4 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : int 5 20 2 14 6 9 4 1 1 8 ...
## $ YearsInCurrentRole : int 2 7 2 10 3 7 2 0 1 2 ...
## $ YearsSinceLastPromotion : int 0 4 2 5 1 1 0 0 0 7 ...
## $ YearsWithCurrManager : int 3 9 2 7 3 7 3 0 0 7 ...
# Competition set with MonthlyIncome withheld — salaries are predicted below.
# TRUE spelled out rather than T, which is a reassignable variable.
noSalary <- read.csv("CaseStudy2CompSet(NoSalary).csv", header = TRUE)
str(noSalary)
## 'data.frame': 300 obs. of 35 variables:
## $ ID : int 871 872 873 874 875 876 877 878 879 880 ...
## $ Age : int 43 33 55 36 27 39 33 21 30 51 ...
## $ Attrition : chr "No" "No" "Yes" "No" ...
## $ BusinessTravel : chr "Travel_Frequently" "Travel_Rarely" "Travel_Rarely" "Non-Travel" ...
## $ DailyRate : int 1422 461 267 1351 1302 895 750 251 1312 1405 ...
## $ Department : chr "Sales" "Research & Development" "Sales" "Research & Development" ...
## $ DistanceFromHome : int 2 13 13 9 19 5 22 10 23 11 ...
## $ Education : int 4 1 4 4 3 3 2 2 3 2 ...
## $ EducationField : chr "Life Sciences" "Life Sciences" "Marketing" "Life Sciences" ...
## $ EmployeeCount : int 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : int 1849 995 1372 1949 1619 42 160 1279 159 1367 ...
## $ EnvironmentSatisfaction : int 1 2 1 1 4 4 3 1 1 4 ...
## $ Gender : chr "Male" "Female" "Male" "Male" ...
## $ HourlyRate : int 92 53 85 66 67 56 95 45 96 82 ...
## $ JobInvolvement : int 3 3 4 4 2 3 3 2 1 2 ...
## $ JobLevel : int 2 1 4 1 1 2 2 1 1 4 ...
## $ JobRole : chr "Sales Executive" "Research Scientist" "Sales Executive" "Laboratory Technician" ...
## $ JobSatisfaction : int 4 4 3 2 1 4 2 3 3 2 ...
## $ MaritalStatus : chr "Married" "Single" "Single" "Married" ...
## $ MonthlyRate : int 19246 17241 9277 9238 16290 3335 15480 25308 22310 24439 ...
## $ NumCompaniesWorked : int 1 3 6 1 1 3 0 1 1 3 ...
## $ Over18 : chr "Y" "Y" "Y" "Y" ...
## $ OverTime : chr "No" "No" "Yes" "No" ...
## $ PercentSalaryHike : int 20 18 17 22 11 14 13 20 25 16 ...
## $ PerformanceRating : int 4 3 3 4 3 3 3 4 4 3 ...
## $ RelationshipSatisfaction: int 3 1 3 2 1 3 1 3 3 2 ...
## $ StandardHours : int 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : int 1 0 0 0 2 1 1 0 3 0 ...
## $ TotalWorkingYears : int 7 5 24 5 7 19 8 2 10 29 ...
## $ TrainingTimesLastYear : int 5 4 2 3 3 6 2 2 2 1 ...
## $ WorkLifeBalance : int 3 3 2 3 3 4 4 1 2 2 ...
## $ YearsAtCompany : int 7 3 19 5 7 1 7 2 10 5 ...
## $ YearsInCurrentRole : int 7 2 7 4 7 0 7 2 7 2 ...
## $ YearsSinceLastPromotion : int 7 0 3 0 0 0 0 2 0 0 ...
## $ YearsWithCurrManager : int 7 2 8 2 7 0 7 2 9 3 ...
# Shared three-color palette used by every plot in this analysis
Rcolors <- c("aquamarine3", "coral1", "plum4")
# Does marital status contribute to attrition?
bar <- ggplot(attrition.df, aes(x = MaritalStatus, fill = MaritalStatus)) +
  geom_bar() +
  facet_wrap(~Attrition) +
  ggtitle("Attrition Dependent on Marital Status") +
  xlab("Marital Status") +  # was mislabeled "Attrition"; the x-axis shows marital status
  ylab("Count")
bar <- bar + scale_fill_manual(values = Rcolors)
# Yes Attrition: count employees who left, broken down by marital status
df1 <- attrition.df %>%
  filter(Attrition == "Yes")
df1length <- nrow(df1)  # total "Yes" employees (not used downstream)
# Exact-match counts; length(grep(...)) would also count substring matches
df1married <- sum(df1$MaritalStatus == "Married")
df1divorced <- sum(df1$MaritalStatus == "Divorced")
df1single <- sum(df1$MaritalStatus == "Single")
Yes_df <- data.frame(MaritalStatus = c("Married", "Divorced", "Single"),
                     Value = c(df1married, df1divorced, df1single))
plot1 <- ggplot(Yes_df, aes(x = "", y = Value, fill = MaritalStatus)) +
  geom_bar(width = 1, stat = "identity") +
  xlab("") + ylab("") + ggtitle("Attrition: Yes")
# A stacked bar mapped to polar coordinates becomes a pie chart
pie1 <- plot1 + coord_polar("y", start = 0)
pie1 <- pie1 + scale_fill_manual(values = Rcolors)
# No Attrition: count employees who stayed, broken down by marital status
df2 <- attrition.df %>%
  filter(Attrition == "No")
df2length <- nrow(df2)  # total "No" employees (not used downstream)
# Exact-match counts; length(grep(...)) would also count substring matches
df2married <- sum(df2$MaritalStatus == "Married")
df2divorced <- sum(df2$MaritalStatus == "Divorced")
df2single <- sum(df2$MaritalStatus == "Single")
No_df <- data.frame(MaritalStatus = c("Married", "Divorced", "Single"),
                    Value = c(df2married, df2divorced, df2single))
plot2 <- ggplot(No_df, aes(x = "", y = Value, fill = MaritalStatus)) +
  geom_bar(width = 1, stat = "identity") +
  xlab("") + ylab("") + ggtitle("Attrition: No")
pie2 <- plot2 + coord_polar("y", start = 0)
pie2 <- pie2 + scale_fill_manual(values = Rcolors)
# Combine: bar chart on top (A), the two pies below (B, C)
piecharts <- plot_grid(pie2, pie1, ncol = 2, labels = c("B", "C"))
allplots <- plot_grid(bar, piecharts, nrow = 2, labels = "A")
allplots
# Distribution of monthly income, colored by attrition status
ggplot(attrition.df, aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  labs(title = "Monthly Income Based on Attrition",
       x = "Monthly Income",
       y = "Count") +
  scale_fill_manual(values = Rcolors)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Same distribution, faceted by job role
ggplot(attrition.df, aes(x = MonthlyIncome, fill = Attrition)) +
  geom_histogram() +
  facet_wrap(~JobRole) +
  labs(title = "Monthly Income Based on Job Role",
       x = "Monthly Income",
       y = "Count") +
  scale_fill_manual(values = Rcolors)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Helper: proportion-filled bar chart of Attrition across one predictor.
# var   - column name of attrition.df, as a string
# title - panel title
# Reproduces the nine hand-written chains this replaces: filled bars,
# percent y-axis, no legend, shared palette.
attrition_fill_bar <- function(var, title) {
  ggplot(attrition.df, aes(x = .data[[var]], fill = Attrition)) +
    geom_bar(position = "fill") +
    ggtitle(title) +
    xlab(var) +  # keep the bare column name as the axis label, as before
    ylab("% Count") +
    scale_y_continuous(labels = scales::percent) +
    theme(legend.position = "none") +
    scale_fill_manual(values = Rcolors)
}
EDA1 <- attrition_fill_bar("RelationshipSatisfaction", "Relationship Satisfaction")
EDA2 <- attrition_fill_bar("JobLevel", "Job Level")
EDA3 <- attrition_fill_bar("JobSatisfaction", "Job Satisfaction")
EDA4 <- attrition_fill_bar("YearsWithCurrManager", "Years With Current Manager")
EDA5 <- attrition_fill_bar("YearsSinceLastPromotion", "Years Since Last Promotion")
EDA6 <- attrition_fill_bar("YearsAtCompany", "Years At Company")
EDA7 <- attrition_fill_bar("YearsInCurrentRole", "Years in Current Role")
EDA8 <- attrition_fill_bar("NumCompaniesWorked", "Companies Worked")
EDA9 <- attrition_fill_bar("MaritalStatus", "Marital Status")
# 3x3 panel of all nine predictors
EDA10 <- plot_grid(EDA1, EDA2, EDA3, EDA4, EDA5, EDA6, EDA7, EDA8, EDA9,
                   ncol = 3, nrow = 3)
EDA10
# Helper: scatter plot of two predictors, colored by Attrition.
# xvar / yvar - column names of attrition.df, as strings
# title       - plot title
attrition_scatter <- function(xvar, yvar, title) {
  ggplot(attrition.df, aes(x = .data[[xvar]], y = .data[[yvar]], color = Attrition)) +
    geom_point() +
    ggtitle(title) +
    xlab(xvar) +  # keep bare column names as axis labels, as before
    ylab(yvar) +
    scale_color_manual(values = Rcolors)
}
EDA.A <- attrition_scatter("TotalWorkingYears", "PercentSalaryHike",
                           "Total Working Years vs Percent Salary Hike")
EDA.B <- attrition_scatter("YearsWithCurrManager", "PercentSalaryHike",
                           "Years With Current Manager vs Percent Salary Hike")
EDA.C <- attrition_scatter("YearsAtCompany", "YearsInCurrentRole",
                           "Years At Company vs Years In Current Role")  # fixed missing space
EDA.D <- attrition_scatter("Age", "MonthlyIncome",
                           "Age vs Monthly Income")
EDA.E <- attrition_scatter("Age", "YearsSinceLastPromotion",
                           "Age vs Years Since Last Promotion")  # fixed garbled title
EDA.F <- attrition_scatter("Age", "PercentSalaryHike",
                           "Age vs Percent Salary Hike")
# 2x3 panel of all six scatter plots
EDA.G <- plot_grid(EDA.A, EDA.B, EDA.C, EDA.D, EDA.E, EDA.F, ncol = 2, nrow = 3)
EDA.G
# Pair plot of the main candidate predictors against Attrition.
# Columns selected by name rather than position (was c(2, 25, 35, 36, 3)).
ggpairs(attrition.df,
        columns = c("Age", "PercentSalaryHike", "YearsSinceLastPromotion",
                    "YearsWithCurrManager", "Attrition"),
        aes(color = Attrition))
# Age vs income by job role, colored by attrition
ggplot(attrition.df, aes(x = Age, y = MonthlyIncome, color = Attrition)) +
  geom_point() +
  facet_wrap(~JobRole) +
  scale_color_manual(values = Rcolors)
# Conclusion: This model does not have enough "Yes" attrition cases to be usable.
# An internal (leave-one-out) model may prove better able to handle this skewed data.
# Reproducible 85/15 train/test split
set.seed(9)
splitPerc <- .85
trainIndices <- sample(seq_len(nrow(attrition.df)), round(splitPerc * nrow(attrition.df)))
dfTrain <- attrition.df[trainIndices, ]
dfTest <- attrition.df[-trainIndices, ]
dfTrain <- na.omit(dfTrain)
dfTest <- na.omit(dfTest)
# Candidate numeric predictors (column index reference):
# 2:Age, 18:JobSatisfaction, 20:MonthlyIncome, 25:PercentSalaryHike,
# 30:TotalWorkingYears, 33:YearsAtCompany, 34:YearsInCurrentRole,
# 35:YearsSinceLastPromotion, 36:YearsWithCurrManager
# knn model on Age + YearsSinceLastPromotion (was columns c(2, 35))
classifications <- knn(dfTrain[, c("Age", "YearsSinceLastPromotion")],
                       dfTest[, c("Age", "YearsSinceLastPromotion")],
                       dfTrain$Attrition,
                       prob = TRUE, k = 10)
table(dfTest$Attrition, classifications)
## classifications
## No Yes
## No 106 0
## Yes 22 2
confusionMatrix(table(dfTest$Attrition, classifications))
## Confusion Matrix and Statistics
##
## classifications
## No Yes
## No 106 0
## Yes 22 2
##
## Accuracy : 0.8308
## 95% CI : (0.7551, 0.8908)
## No Information Rate : 0.9846
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1291
##
## Mcnemar's Test P-Value : 7.562e-06
##
## Sensitivity : 0.82812
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.08333
## Prevalence : 0.98462
## Detection Rate : 0.81538
## Detection Prevalence : 0.81538
## Balanced Accuracy : 0.91406
##
## 'Positive' Class : No
##
# Tune k: average test accuracy over many random train/test partitions.
# (Removed an `accs` data frame and a discarded table() call that were
# created inside the loop but never used.)
iterations <- 100
numks <- 50
masterAcc <- matrix(nrow = iterations, ncol = numks)
for (j in seq_len(iterations)) {
  trainIndices <- sample(seq_len(nrow(attrition.df)), round(splitPerc * nrow(attrition.df)))
  train <- attrition.df[trainIndices, ]
  test <- attrition.df[-trainIndices, ]
  train <- na.omit(train)
  test <- na.omit(test)
  for (i in seq_len(numks)) {
    classifications <- knn(train[, c("Age", "YearsSinceLastPromotion")],
                           test[, c("Age", "YearsSinceLastPromotion")],
                           train$Attrition, prob = TRUE, k = i)
    CM <- confusionMatrix(table(classifications, test$Attrition))
    masterAcc[j, i] <- CM$overall[1]  # overall accuracy for this partition and k
  }
}
# Mean accuracy per k across all partitions
MeanAcc <- colMeans(masterAcc)
plot(seq(1, numks, 1), MeanAcc, type = "l", xlab = "k", ylab = "Mean Accuracy",
     main = "Mean Accuracy of k Values")
# Internal model: leave-one-out cross-validated knn on the training set
# (same Age + YearsSinceLastPromotion predictors, was columns c(2, 35))
classifications1 <- knn.cv(dfTrain[, c("Age", "YearsSinceLastPromotion")],
                           dfTrain$Attrition, k = 20)
confusionMatrix(table(classifications1, dfTrain$Attrition))
## Confusion Matrix and Statistics
##
##
## classifications1 No Yes
## No 618 107
## Yes 6 9
##
## Accuracy : 0.8473
## 95% CI : (0.8193, 0.8725)
## No Information Rate : 0.8432
## P-Value [Acc > NIR] : 0.4044
##
## Kappa : 0.1053
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.99038
## Specificity : 0.07759
## Pos Pred Value : 0.85241
## Neg Pred Value : 0.60000
## Prevalence : 0.84324
## Detection Rate : 0.83514
## Detection Prevalence : 0.97973
## Balanced Accuracy : 0.53399
##
## 'Positive' Class : No
##
# Naive Bayes on OverTime + MonthlyIncome (was columns c(24, 20))
NBmodel <- naiveBayes(dfTrain[, c("OverTime", "MonthlyIncome")],
                      dfTrain$Attrition, laplace = 1)
table(predict(NBmodel, dfTest[, c("OverTime", "MonthlyIncome")]), dfTest$Attrition)
##
## No Yes
## No 106 24
## Yes 0 0
confusionMatrix(table(predict(NBmodel, dfTest[, c("OverTime", "MonthlyIncome")]),
                      dfTest$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 106 24
## Yes 0 0
##
## Accuracy : 0.8154
## 95% CI : (0.7379, 0.878)
## No Information Rate : 0.8154
## P-Value [Acc > NIR] : 0.5543
##
## Kappa : 0
##
## Mcnemar's Test P-Value : 2.668e-06
##
## Sensitivity : 1.0000
## Specificity : 0.0000
## Pos Pred Value : 0.8154
## Neg Pred Value : NaN
## Prevalence : 0.8154
## Detection Rate : 0.8154
## Detection Prevalence : 1.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : No
##
# Naive Bayes using every available predictor
NBmodel1 <- naiveBayes(Attrition ~ ., data = dfTrain)
# Distribution of predicted classes on the held-out set
table(predict(NBmodel1, dfTest))
##
## No Yes
## 102 28
# Confusion matrix against the true labels
confusionMatrix(table(predict(NBmodel1, dfTest), dfTest$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 96 6
## Yes 10 18
##
## Accuracy : 0.8769
## 95% CI : (0.8078, 0.928)
## No Information Rate : 0.8154
## P-Value [Acc > NIR] : 0.0401
##
## Kappa : 0.616
##
## Mcnemar's Test P-Value : 0.4533
##
## Sensitivity : 0.9057
## Specificity : 0.7500
## Pos Pred Value : 0.9412
## Neg Pred Value : 0.6429
## Prevalence : 0.8154
## Detection Rate : 0.7385
## Detection Prevalence : 0.7846
## Balanced Accuracy : 0.8278
##
## 'Positive' Class : No
##
# Make a reduced dataframe of selected predictors and split into train/test.
# Columns chosen by name, in the same order as the former index vector
# c(2,3,4,5,6,7,15,16,18,20,29,30,32,33,34,35).
set.seed(9)
keepVars <- c("Age", "Attrition", "BusinessTravel", "DailyRate", "Department",
              "DistanceFromHome", "JobInvolvement", "JobLevel", "JobSatisfaction",
              "MonthlyIncome", "StockOptionLevel", "TotalWorkingYears",
              "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole",
              "YearsSinceLastPromotion")
new.df <- attrition.df[, keepVars]
splitPerc <- .7
trainIndices <- sample(seq_len(nrow(new.df)), round(splitPerc * nrow(new.df)))
dfTrain1 <- new.df[trainIndices, ]
dfTest1 <- new.df[-trainIndices, ]
dfTrain1 <- na.omit(dfTrain1)
dfTest1 <- na.omit(dfTest1)
# Naive Bayes model for the reduced dataframe
new.dfmodel <- naiveBayes(Attrition ~ ., data = dfTrain1)
confusionMatrix(table(predict(new.dfmodel, dfTest1), dfTest1$Attrition))
## Confusion Matrix and Statistics
##
##
## No Yes
## No 206 20
## Yes 17 18
##
## Accuracy : 0.8582
## 95% CI : (0.8099, 0.8982)
## No Information Rate : 0.8544
## P-Value [Acc > NIR] : 0.4733
##
## Kappa : 0.4109
##
## Mcnemar's Test P-Value : 0.7423
##
## Sensitivity : 0.9238
## Specificity : 0.4737
## Pos Pred Value : 0.9115
## Neg Pred Value : 0.5143
## Prevalence : 0.8544
## Detection Rate : 0.7893
## Detection Prevalence : 0.8659
## Balanced Accuracy : 0.6987
##
## 'Positive' Class : No
##
# NOTE(review): not sure linear regression is actually appropriate for this
# kind of binary-response data.
# Encode attrition as numeric (Yes = 1, No = 0) for the linear regression model.
# Single ifelse() replaces the previous gsub / gsub / as.numeric chain.
attrition.lm <- attrition.df
attrition.lm$Attrition <- ifelse(attrition.lm$Attrition == "Yes", 1, 0)
# Candidate linear probability models for the 0/1 Attrition response:
# interaction, additive, and the two single-predictor versions for comparison.
attritionfit = lm(Attrition~Age*MonthlyIncome, data = attrition.lm)
attritionfit1 = lm(Attrition~Age+MonthlyIncome, data = attrition.lm)
attritionfit2 = lm(Attrition~MonthlyIncome, data = attrition.lm)
attritionfit3 = lm(Attrition~Age, data = attrition.lm)
# information about the interaction model (the one examined below)
summary(attritionfit)
##
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = attrition.lm)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.37829 -0.19156 -0.13330 -0.06808 1.02288
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.202e-01 8.883e-02 6.981 5.83e-12 ***
## Age -1.030e-02 2.353e-03 -4.379 1.34e-05 ***
## MonthlyIncome -5.761e-05 1.400e-05 -4.114 4.26e-05 ***
## Age:MonthlyIncome 1.130e-06 3.153e-07 3.583 0.000358 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3599 on 866 degrees of freedom
## Multiple R-squared: 0.04538, Adjusted R-squared: 0.04207
## F-statistic: 13.72 on 3 and 866 DF, p-value: 9.485e-09
# 95% confidence intervals for the interaction-model coefficients
confint(attritionfit)
## 2.5 % 97.5 %
## (Intercept) 4.457982e-01 7.945094e-01
## Age -1.491937e-02 -5.684270e-03
## MonthlyIncome -8.508807e-05 -3.012455e-05
## Age:MonthlyIncome 5.109013e-07 1.748437e-06
# Point estimates
attritionfit$coefficients
## (Intercept) Age MonthlyIncome Age:MonthlyIncome
## 6.201538e-01 -1.030182e-02 -5.760631e-05 1.129669e-06
# Residual diagnostics
hist(attritionfit$residuals, col = "blue", main = "Histogram of Residuals")
# Root mean squared error of the fit
sqrt(mean(attritionfit$residuals^2))
## [1] 0.3590231
# Residual sum of squares
sum((attritionfit$residuals)^2)
## [1] 112.1409
# Reproducible 80/20 train/test split to evaluate the regression out of sample
set.seed(9)
splitPerc <- .8
trainIndices <- sample(seq_len(nrow(attrition.lm)), round(splitPerc * nrow(attrition.lm)))
dfTrain2 <- attrition.lm[trainIndices, ]
dfTest2 <- attrition.lm[-trainIndices, ]
dfTrain2 <- na.omit(dfTrain2)
dfTest2 <- na.omit(dfTest2)
# Best LM model for Attrition: Age, MonthlyIncome, and their interaction
LR.fit <- lm(Attrition ~ Age * MonthlyIncome, data = dfTrain2)
summary(LR.fit)
##
## Call:
## lm(formula = Attrition ~ Age * MonthlyIncome, data = dfTrain2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.38770 -0.19212 -0.13098 -0.06281 0.97511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.504e-01 9.709e-02 6.698 4.37e-11 ***
## Age -1.093e-02 2.578e-03 -4.238 2.56e-05 ***
## MonthlyIncome -6.789e-05 1.542e-05 -4.403 1.24e-05 ***
## Age:MonthlyIncome 1.351e-06 3.464e-07 3.900 0.000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3576 on 692 degrees of freedom
## Multiple R-squared: 0.05128, Adjusted R-squared: 0.04716
## F-statistic: 12.47 on 3 and 692 DF, p-value: 6.013e-08
# Out-of-sample predictions from the held-out set
LR.Preds <- predict(LR.fit, newdata = dfTest2)
# Metrics for the model (mean squared prediction error and RMSE)
MSPE <- data.frame(Observed = dfTest2$Attrition, Predicted = LR.Preds)
MSPE$Residual <- MSPE$Observed - MSPE$Predicted  # fixed "Resisdual" typo
MSPE$SquaredResidual <- MSPE$Residual^2
mean(MSPE$SquaredResidual)
## [1] 0.1362915
RMSE <- sqrt(mean((MSPE$Observed - MSPE$Predicted)^2))
RMSE
## [1] 0.3691768
# Treat JobLevel (1-5) as categorical so lm() fits one level per coefficient.
# The previous chain of gsub(i, "i", ...) calls was a no-op beyond coercing
# to character, so coerce directly.
attrition.jl <- attrition.df
attrition.jl$JobLevel <- as.character(attrition.jl$JobLevel)
# Job level model: one intercept shift per job level
JLModel_fit <- lm(MonthlyIncome ~ JobLevel, data = attrition.jl)
summary(JLModel_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.jl)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4642.2 -668.0 -107.3 668.3 4412.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2743.82 69.69 39.37 <2e-16 ***
## JobLevel2 2800.46 99.89 28.04 <2e-16 ***
## JobLevel3 7108.38 130.24 54.58 <2e-16 ***
## JobLevel4 12509.83 177.45 70.50 <2e-16 ***
## JobLevel5 16480.15 219.18 75.19 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9244
## F-statistic: 2658 on 4 and 865 DF, p-value: < 2.2e-16
# Fitted values and residual diagnostics for the categorical JobLevel model
preds <- predict(JLModel_fit)
hist(JLModel_fit$residuals, col = "blue", main = "Histogram of Residuals")
plot(JLModel_fit$fitted.values,JLModel_fit$residuals, main = "Plot of Residuals v. Fitted Values")
# Fitted values overlaid on the raw data.
# NOTE(review): col = "red" inside aes() maps a constant rather than setting
# the color, so ggplot draws the default palette color with a "red" legend
# entry — confirm this is the intended look before changing it.
attrition.jl %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
geom_point() +
geom_line(data = attrition.jl, aes(x = JobLevel, y = preds, col = "red"))
# Single-variable model: MonthlyIncome on numeric JobLevel
Model1_fit <- lm(MonthlyIncome ~ JobLevel, data = attrition.df)
summary(Model1_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel, data = attrition.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5037.1 -928.2 80.1 697.1 3723.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1793.93 101.68 -17.64 <2e-16 ***
## JobLevel 4013.67 43.98 91.26 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1413 on 868 degrees of freedom
## Multiple R-squared: 0.9056, Adjusted R-squared: 0.9055
## F-statistic: 8329 on 1 and 868 DF, p-value: < 2.2e-16
hist(Model1_fit$residuals, col = "blue", main = "Histogram of Residuals")
# Predict the withheld salaries, keeping only the point estimate ("fit")
SalaryPredict <- predict(Model1_fit, noSalary, interval = "confidence")
SalaryPredict <- as.data.frame(SalaryPredict)
noSalary$MonthlyIncome <- SalaryPredict[, 1]
# Predicted values plotted
noSalary %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(data = noSalary, aes(x = JobLevel, y = MonthlyIncome, col = "red")) +
  ggtitle("Predicted Values using Single Variable Model") +  # fixed "Varialbe" typo
  scale_color_manual(values = Rcolors)
# Distribution of predicted responses
ggplot(noSalary, aes(x = MonthlyIncome)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Good quadratic model: add a JobLevel^2 term.
# ("attriton.df2" misspelling corrected throughout this section.)
attrition.df2 <- attrition.df %>% mutate(JobLevel2 = (JobLevel^2))
fit <- lm(MonthlyIncome ~ JobLevel + JobLevel2, attrition.df2)
summary(fit)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2, data = attrition.df2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4502.5 -744.2 -131.4 656.1 4177.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 299.75 190.65 1.572 0.116
## JobLevel 1944.17 169.12 11.496 <2e-16 ***
## JobLevel2 397.80 31.56 12.603 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1300 on 867 degrees of freedom
## Multiple R-squared: 0.9202, Adjusted R-squared: 0.92
## F-statistic: 5001 on 2 and 867 DF, p-value: < 2.2e-16
preds2 <- predict(fit)
hist(fit$residuals, col = "blue", main = "Histogram of Residuals")
plot(fit$fitted.values, fit$residuals, main = "Plot of Residuals v. Fitted Values")
# Show model fit over the original data
attrition.df2 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(data = attrition.df2, aes(x = JobLevel, y = preds2, col = "red")) +
  scale_color_manual(values = Rcolors)
# Use the model to predict the withheld salaries
noSalary <- noSalary %>% mutate(JobLevel2 = (JobLevel^2))
test2 <- predict(fit, noSalary, interval = "confidence")
test2 <- as.data.frame(test2)
# Predicted values plotted
noSalary$Salary <- test2[, 1]
noSalary %>% ggplot(aes(x = JobLevel, y = Salary)) +
  geom_point() +
  geom_line(data = noSalary, aes(x = JobLevel, y = Salary, col = "red")) +
  scale_color_manual(values = Rcolors)
# Adding a cubic term helped.
# ("attriton.df3" misspelling corrected throughout this section.)
attrition.df3 <- attrition.df %>% mutate(JobLevel2 = (JobLevel^2),
                                         JobLevel3 = (JobLevel^3))
fit2 <- lm(MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3, attrition.df3)
summary(fit2)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3,
## data = attrition.df3)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4796.8 -669.9 -108.7 652.2 4456.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3012.60 435.69 6.915 9.11e-12 ***
## JobLevel -2176.07 620.83 -3.505 0.00048 ***
## JobLevel2 2125.20 252.82 8.406 < 2e-16 ***
## JobLevel3 -207.57 30.15 -6.884 1.12e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1267 on 866 degrees of freedom
## Multiple R-squared: 0.9244, Adjusted R-squared: 0.9241
## F-statistic: 3528 on 3 and 866 DF, p-value: < 2.2e-16
preds3 <- predict(fit2)
hist(fit2$residuals, col = "blue", main = "Histogram of Residuals")
plot(fit2$fitted.values, fit2$residuals, main = "Plot of Residuals v. Fitted Values")
# Plot with predicted values on the original dataset
attrition.df3 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(data = attrition.df3, aes(x = JobLevel, y = preds3, col = "red"))
# Predict the withheld salaries with the cubic model
noSalary <- noSalary %>% mutate(JobLevel3 = (JobLevel^3))
test3 <- predict(fit2, noSalary, interval = "confidence")
test3 <- as.data.frame(test3)
noSalary$Salary2 <- test3[, 1]
# Predicted values plotted
noSalary %>% ggplot(aes(x = JobLevel, y = Salary2)) +
  geom_point() +
  geom_line(data = noSalary, aes(x = JobLevel, y = Salary2, col = "red")) +
  scale_color_manual(values = Rcolors)
# Adding a fourth-power term hurt (nothing is significant).
# ("attriton.df4" misspelling corrected; predictions renamed predsQuartic so
# they no longer silently overwrite the cubic model's preds3.)
attrition.df4 <- attrition.df %>% mutate(JobLevel2 = (JobLevel^2),
                                         JobLevel3 = (JobLevel^3),
                                         JobLevel4 = (JobLevel^4))
fit3 <- lm(MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 + JobLevel4, attrition.df4)
summary(fit3)
##
## Call:
## lm(formula = MonthlyIncome ~ JobLevel + JobLevel2 + JobLevel3 +
## JobLevel4, data = attrition.df4)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4642.2 -668.0 -107.3 668.3 4412.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -245.90 1597.80 -0.154 0.8777
## JobLevel 4177.61 3061.29 1.365 0.1727
## JobLevel2 -1910.41 1920.81 -0.995 0.3202
## JobLevel3 810.46 481.29 1.684 0.0926 .
## JobLevel4 -87.95 41.50 -2.119 0.0343 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1264 on 865 degrees of freedom
## Multiple R-squared: 0.9248, Adjusted R-squared: 0.9244
## F-statistic: 2658 on 4 and 865 DF, p-value: < 2.2e-16
predsQuartic <- predict(fit3)
hist(fit3$residuals, col = "blue", main = "Histogram of Residuals")
plot(fit3$fitted.values, fit3$residuals, main = "Plot of Residuals v. Fitted Values")
attrition.df4 %>% ggplot(aes(x = JobLevel, y = MonthlyIncome)) +
  geom_point() +
  geom_line(data = attrition.df4, aes(x = JobLevel, y = predsQuartic, col = "red")) +
  scale_color_manual(values = Rcolors)
# Best model: age, numeric job level, and job role together
Model2_fit <- lm(MonthlyIncome ~ Age + JobLevel + JobRole, data = attrition.df)
summary(Model2_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole, data = attrition.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3639.3 -671.7 -46.5 634.8 4097.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -564.822 249.203 -2.267 0.02367 *
## Age 13.701 4.729 2.898 0.00386 **
## JobLevel 3032.839 69.606 43.571 < 2e-16 ***
## JobRoleHuman Resources -324.725 255.782 -1.270 0.20459
## JobRoleLaboratory Technician -526.524 171.357 -3.073 0.00219 **
## JobRoleManager 3968.183 232.376 17.077 < 2e-16 ***
## JobRoleManufacturing Director 85.832 169.605 0.506 0.61294
## JobRoleResearch Director 3941.604 217.548 18.118 < 2e-16 ***
## JobRoleResearch Scientist -249.221 171.560 -1.453 0.14668
## JobRoleSales Executive -127.854 146.073 -0.875 0.38167
## JobRoleSales Representative -404.623 215.498 -1.878 0.06077 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1080 on 859 degrees of freedom
## Multiple R-squared: 0.9455, Adjusted R-squared: 0.9448
## F-statistic: 1490 on 10 and 859 DF, p-value: < 2.2e-16
preds4 <- predict(Model2_fit)
hist(Model2_fit$residuals, col = "blue", main = "Histogram of Residuals")
# Square root of the residual sum of squares
sqrt(sum((Model2_fit$residuals)^2))
## [1] 31647.6
# Predict the withheld salaries with this model
preds5 <- predict(Model2_fit, noSalary)
preds5 <- as.data.frame(preds5)
noSalary$Salary3 <- preds5[, 1]
# Plot predicted values
noSalary %>% ggplot(aes(x = Age, y = Salary3)) +
  geom_point() +
  #geom_line(data = noSalary, aes(x = Age, y = Salary3 , col = "red")) +
  scale_color_manual(values = Rcolors) +
  labs(title = "Predicted Salary by Age", y = "Salary")
# Write the predictions out for submission
write.csv(noSalary, file = "Case2PredictionsBoyd_Pandey Salary.csv", row.names = FALSE)
# Decent model: add JobSatisfaction to the best model. Its coefficient is not
# significant (p = 0.728), so the simpler model is preferred.
# Renamed to Model3_fit so this no longer clobbers Model2_fit, the chosen
# "best model" whose predictions were already written out above.
Model3_fit <- lm(MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction, data = attrition.df)
summary(Model3_fit)
##
## Call:
## lm(formula = MonthlyIncome ~ Age + JobLevel + JobRole + JobSatisfaction,
## data = attrition.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3630.4 -666.4 -39.3 631.2 4094.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -598.056 267.029 -2.240 0.02537 *
## Age 13.700 4.731 2.896 0.00388 **
## JobLevel 3033.178 69.649 43.550 < 2e-16 ***
## JobRoleHuman Resources -321.176 256.116 -1.254 0.21018
## JobRoleLaboratory Technician -524.491 171.544 -3.057 0.00230 **
## JobRoleManager 3971.223 232.659 17.069 < 2e-16 ***
## JobRoleManufacturing Director 87.033 169.727 0.513 0.60823
## JobRoleResearch Director 3945.016 217.881 18.106 < 2e-16 ***
## JobRoleResearch Scientist -248.490 171.661 -1.448 0.14810
## JobRoleSales Executive -126.622 146.191 -0.866 0.38665
## JobRoleSales Representative -402.660 215.682 -1.867 0.06225 .
## JobSatisfaction 11.475 33.006 0.348 0.72819
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1080 on 858 degrees of freedom
## Multiple R-squared: 0.9455, Adjusted R-squared: 0.9448
## F-statistic: 1353 on 11 and 858 DF, p-value: < 2.2e-16
Model3_Preds <- predict(Model3_fit, newdata = noSalary)
# Didn't seem to match the values given by the model
# MSPE = data.frame(Observed = dfTest$MonthlyIncome, Predicted = Model2_Preds)
# MSPE$Resisdual = MSPE$Observed - MSPE$Predicted
# MSPE$SquaredResidual = MSPE$Resisdual^2
# MSPE
# mean(MSPE$SquaredResidual)
# RMSE <- mean((MSPE$Observed - MSPE$Predicted)^2) %>% sqrt()
# RMSE